# -*- coding: utf-8 -*-
import re
import scrapy
from ArticleSpider.items import ArticleSpiderItem
class JobboleSpider(scrapy.Spider):
    """Spider that extracts the fields of one Jobbole article page.

    The response for each start URL is handed to :meth:`parse`, which
    pulls the article fields out with XPath and yields them as a single
    ``ArticleSpiderItem``.
    """

    name = "jobbole"
    # NOTE(review): the start URL below is on top.jobbole.com, which is not
    # listed here -- confirm whether this entry should be widened before
    # this spider starts yielding follow-up requests.
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://top.jobbole.com/38569/']

    def parse(self, response):
        """Extract the article fields from ``response`` and yield one item.

        Every field is read with ``extract_first()``, so a field whose
        XPath matches nothing is stored as ``None`` instead of raising.

        Args:
            response: The downloaded page; only its ``xpath`` method is used.

        Yields:
            ArticleSpiderItem: populated with ``title``, ``create_data``,
            ``author``, ``praise_num``, ``comment_num`` and ``content``.
        """
        article_item = ArticleSpiderItem()
        # Title.
        title = response.xpath('//div[@class="media-body"]/h1/a/text()').extract_first()
        # Publication date.
        create_data = response.xpath('//div[@class="media-body"]/p/span[1]/text()').extract_first()
        # Article author.
        author = response.xpath('//div[@class="media-body"]/p/span[3]/a[2]/text()').extract_first()
        # Number of likes.
        praise_num = response.xpath('//div[@class="post-adds"]/span[1]/h10/text()').extract_first()
        # Number of comments (raw link text at this point).
        comment_num = response.xpath('//div[@class="media-body"]/p/span[4]/a/text()').extract_first()
        # Keep only the first run of digits from the comment text.
        # extract_first() yields None when the node is absent, and re.match
        # would raise TypeError on None, so guard before matching.
        if comment_num is not None:
            match_re = re.match(r".*?(\d+).*", comment_num)
            if match_re:
                comment_num = match_re.group(1)
        # Article body, kept as the serialized HTML of the container div.
        content = response.xpath('//div[@class = "p-entry"]').extract_first()
        article_item["title"] = title
        article_item["create_data"] = create_data
        article_item["author"] = author
        article_item["praise_num"] = praise_num
        article_item["comment_num"] = comment_num
        article_item["content"] = content

        yield article_item



